When crawling websites with relative URLs, make them absolute

itkevin 12 years ago
parent
commit
fd8761177f
2 changed files with 4 additions and 1 deletions
  1. 3 0
      app/models/agents/website_agent.rb
  2. 1 1
      spec/models/agents/website_agent_spec.rb

+ 3 - 0
app/models/agents/website_agent.rb

@@ -99,6 +99,9 @@ module Agents
99 99
           result = {}
100 100
           options[:extract].keys.each do |name|
101 101
             result[name] = output[name][index]
102
+            if name.to_s == 'url'
103
+              result[name] = URI.join( options[:url], result[name]).to_s if (result[name] =~ URI::DEFAULT_PARSER.regexp[:ABS_URI]).nil?
104
+            end
102 105
           end
103 106
 
104 107
           if !options[:mode] || options[:mode].to_s == "all" || (options[:mode].to_s == "on_change" && !previous_payloads.include?(result.to_json))

+ 1 - 1
spec/models/agents/website_agent_spec.rb

@@ -68,7 +68,7 @@ describe Agents::WebsiteAgent do
68 68
       rel.save!
69 69
       rel.check
70 70
       event = Event.last
71
-      event.payload[:url].should == "http://imgs.xkcd.com/about"
71
+      event.payload[:url].should == "http://xkcd.com/about"
72 72
     end
73 73
         
74 74
     describe "JSON" do